# install packages
#install.packages("plyr")
#install.packages("dplyr")
#install.packages("ggplot2")
#install.packages("ggmap")

# declare libraries
library(plyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(ggmap)

# read in citibike data
citiBike <- read.csv("201511-citibike-tripdata.csv")

Patterns in Ride History Data

Identify patterns in the ride history data. Explain these patterns using appropriate visualization. Two potential patterns are (this is an illustrative list, you should formulate your own patterns as well to be explored here):

1. Which stations see the most asymmetric traffic (more arrivals than departures and vice versa)?

endStation<-as.data.frame(table(citiBike$end.station.id))
startStation<-as.data.frame(table(citiBike$start.station.id))
stationData <- merge(startStation, endStation, by="Var1", all=TRUE)
stationData[is.na(stationData)] <- 0
names(stationData) <- c("station.id", "start.freq", "end.freq")
stationData$inTraffic <- stationData$start.freq - stationData$end.freq
stationData$outTraffic <- stationData$end.freq - stationData$start.freq

The stations with the most outgoing traffic are:

outTraffic <- stationData[order(-stationData$outTraffic),]
barplot(outTraffic$outTraffic[outTraffic$outTraffic > 300], main="Bike Stations with Highest Net Outflow of Bikes", names.arg = outTraffic$station.id[outTraffic$outTraffic > 300], xlab = "Station IDs", ylab = "Net Outflow (Bikes)")

The stations with the most incoming traffic are:

inTraffic <- stationData[order(-stationData$inTraffic),]
barplot(inTraffic$inTraffic[inTraffic$inTraffic > 300], main="Bike Stations with Highest Net Inflow of Bikes", names.arg = inTraffic$station.id[inTraffic$inTraffic > 300], xlab = "Station IDs", ylab = "Net Inflow (Bikes)")

2. Which stations originate the longest rides? Does this change as we go through different times of the day?

longestRide <- aggregate(citiBike[, 1], list(citiBike$start.station.id), mean)
names(longestRide) <- c("station.id", "mean.trip.duration")
stationData <- merge(stationData, longestRide, by="station.id", all=TRUE)
stationData[is.na(stationData)] <- 0

longestRide <- stationData[order(-stationData$mean.trip.duration),]
barplot(longestRide$mean.trip.duration[longestRide$mean.trip.duration > 2000], main = "Stations that Originate the Longest Rides", names.arg = longestRide$station.id[longestRide$mean.trip.duration > 2000], xlab = "Station IDs", ylab = "Ride Duration")

citiBike$time.slot = strtoi(format(as.POSIXct(citiBike$starttime, format = "%m/%d/%Y %H:%M:%S"), format = "%H"), base = 10L)
citiBike$time.slot.category[citiBike$time.slot < 8] = "earlyMorning"
citiBike$time.slot.category[citiBike$time.slot < 12 & citiBike$time.slot >= 8] <- "morning"
citiBike$time.slot.category[citiBike$time.slot < 16 & citiBike$time.slot >= 12] <- "afternoon"
citiBike$time.slot.category[citiBike$time.slot < 20 & citiBike$time.slot >= 16] <- "evening"
citiBike$time.slot.category[citiBike$time.slot >= 20] <- "lateEvening"

longest.ride.category <- aggregate(citiBike[, 1], list(citiBike$start.station.id, citiBike$time.slot.category), mean)
names(longest.ride.category) <- c("station.id", "time.slot", "mean")


longest.ride.category <- longest.ride.category[order(-longest.ride.category$mean),]

early.morning.ride = longest.ride.category[longest.ride.category$time.slot == "earlyMorning",]
barplot(early.morning.ride$mean[early.morning.ride$mean > 2100], names.arg = early.morning.ride$station.id[early.morning.ride$mean > 2100], main="Stations that Originate the Longest Rides in the Early Morning", xlab = "Station IDs", ylab = "Mean Ride Duration")

morning.ride = longest.ride.category[longest.ride.category$time.slot == "morning",]
barplot(morning.ride$mean[morning.ride$mean > 2100], names.arg = morning.ride$station.id[morning.ride$mean > 2100], main="Stations that Originate the Longest Rides in the Morning", xlab = "Station IDs", ylab = "Mean Ride Duration")

afternoon.ride = longest.ride.category[longest.ride.category$time.slot == "afternoon",]
barplot(afternoon.ride$mean[afternoon.ride$mean > 2500], names.arg = afternoon.ride$station.id[afternoon.ride$mean > 2500], main="Stations that Originate the Longest Rides in the Afternoon", xlab = "Station IDs", ylab = "Mean Ride Duration")

evening.ride = longest.ride.category[longest.ride.category$time.slot == "evening",]
barplot(evening.ride$mean[evening.ride$mean > 2100], names.arg = evening.ride$station.id[evening.ride$mean > 2100], main="Stations that Originate the Longest Rides in the Evening", xlab = "Station IDs", ylab = "Mean Ride Duration")

late.evening.ride = longest.ride.category[longest.ride.category$time.slot == "lateEvening",]
barplot(late.evening.ride$mean[late.evening.ride$mean > 2500], names.arg = late.evening.ride$station.id[late.evening.ride$mean > 2500], main = "Stations that Originate the Longest Rides in the Late Evening", xlab = "Station IDs", ylab = "Mean Ride Duration")

Dataset Visualization

# change

# ===============================================================================
# ===============================[calculations]==================================
# ===============================================================================

# =============================[most used routes]================================

# function returns a unique integer from two integer inputs
cantor_pairing_function <- function(a, b){
  return(0.5 * (a + b) * (a + b + 1) + b)
}

# get all routes taken
routes <- subset(citiBike, select = c(start.station.id, end.station.id, start.station.latitude, start.station.longitude, end.station.latitude, end.station.longitude))
colnames(routes) <- c("start.id", "end.id", "start.lat", "start.lon", "end.lat", "end.lon")

# apply unique route.id based on the cantor pairing function, with start.id and end.id as inputs
routes$route.id = cantor_pairing_function(routes$start.id, routes$end.id)

# count # of times a route has been used
route_counts <- routes %>%
  count(route.id)
colnames(route_counts) <- c("route.id", "route.count")

# get only unique routes
unique_routes <- unique(routes)

# merge route count with ids, lats, and longitudes
route_counts <- join(unique_routes, route_counts, by = "route.id", type = "full", match = "first")
route_counts <- subset(route_counts, select = -c(route.id) )

# sort by highest count 
route_counts <- route_counts[with(route_counts, order(-route.count)), ]

# =============================[station surplus]================================

# get arrivals to station
arrival_counts <- citiBike %>%
  count(end.station.id)
colnames(arrival_counts) <- c("id", "arr")

# get departures from station
departure_counts <- citiBike %>%
  count(start.station.id)
colnames(departure_counts) <- c("id", "dep")

# get unique stations
stations <- unique(subset(citiBike, select = c(start.station.id, start.station.latitude, start.station.longitude)))
colnames(stations) <- c("id", "lat", "lon")

# merge arrivals and departures with list of stations
stations <- join(stations, arrival_counts, by = "id", type = "full", match = "first")
stations <- join(stations, departure_counts, by = "id", type = "full", match = "first")

# calculate surplus of station
stations$surplus = stations$arr - stations$dep

# seperate surplus and deficit stations
stations.surplus <- subset(stations, surplus >= 0)
stations.deficit <- subset(stations, surplus <= 0)


# ===============================================================================
# ==================================[mapping]====================================
# ===============================================================================

min_lat <- min(c(min(citiBike$start.station.latitude), min(citiBike$end.station.latitude))) + 0.025
max_lat <- max(c(max(citiBike$start.station.latitude), max(citiBike$end.station.latitude))) + 0.0055
min_lon <- min(c(min(citiBike$start.station.longitude), min(citiBike$end.station.longitude))) + 0.029
max_lon <- max(c(max(citiBike$start.station.longitude), max(citiBike$end.station.longitude))) + 0.005

nyc <- c(left = min_lon, bottom = min_lat, right = max_lon, top = max_lat)
basemap <- get_map(nyc, zoom = 13, maptype = "terrain", source = "stamen")
## Map from URL : http://tile.stamen.com/terrain/13/2411/3077.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2412/3077.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2413/3077.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2411/3078.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2412/3078.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2413/3078.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2411/3079.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2412/3079.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2413/3079.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2411/3080.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2412/3080.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2413/3080.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2411/3081.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2412/3081.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2413/3081.jpg
# =============================[most used routes]================================

# declare variables
routes_to_show <- 10
origin <- c()
dest <- c()

# get routes
for(i in 1:routes_to_show){
  origin <- append(origin, paste(routes$start.lat[i], routes$start.lon[i], sep=","))
  dest <- append(dest, paste(routes$end.lat[i], routes$end.lon[i], sep=","))
}

# initialize routes map
map.routes <- ggmap(basemap, base_layer = ggplot(stations, aes(x = lon, y = lat)))

# build routes map
for(i in 1:(routes_to_show)){
  k <- (i - 1) / routes_to_show
  map.routes <- map.routes +
    geom_path(aes(x = lon, y = lat), data = route(origin[i], dest[i], structure = "route"),
              color = "darkred", size = 2 - (k * (2 - 0.5)), alpha = 1 - (k * (1 - 0.5)))
}
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.74025878,-73.98409214&destination=40.71893904,-73.99266288&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.74025878,-73.98409214&destination=40.71893904,-73.99266288&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.74144387,-73.97536082&destination=40.74854862,-73.98808416&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.72743423,-73.99379025&destination=40.72405549,-74.00965965&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.73454567,-73.99074142&destination=40.722103786686,-73.997249007225&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.71910537,-73.99973337&destination=40.74394314,-73.97966069&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.749156,-73.9916&destination=40.74444921,-73.98303529&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.700469,-73.991454&destination=40.68926942,-73.98912867&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.71542197,-74.01121978&destination=40.756014,-73.967416&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.750967348716,-73.994442075491&destination=40.73454567,-73.99074142&mode=driving&units=metric&alternatives=false&sensor=false
# display route map
map.routes

# =============================[station surplus]================================

# build surplus stations map
map.surplus <- ggmap(basemap, base_layer = ggplot(stations.surplus, aes(x = lon, y = lat))) +
  geom_point(aes(color = surplus, size = surplus), alpha = .8) +
  scale_color_gradient(low = "green", high = "darkgreen") +
  scale_size(range = c(0.5, 10))

# build deficit stations map
map.deficit <- ggmap(basemap, base_layer = ggplot(stations.deficit, aes(x = lon, y = lat))) +
  geom_point(aes(color = surplus, size = surplus), alpha = .8) +
  scale_color_gradient(low = "darkred", high = "red") +
  scale_size(range = c(30, 1))

# display surplus and deficit maps
map.surplus

map.deficit

Business Issues

Stations running out of bikes is a big problem. Client would want to know which stations are candidates for improving bike storage capacity.

# The stations that are start stations the most number of times are likely the best candidates for a higher bike storage capacity
startStationCount <- as.data.frame(table(citiBike$start.station.id))
startStationCount <- startStationCount[order(startStationCount$Freq, decreasing=TRUE),]
head(startStationCount)
##     Var1  Freq
## 286  519 11267
## 81   293  8810
## 206  435  8268
## 265  497  7740
## 77   285  7175
## 16   151  6172

Bike maintenance bills are piling up. Client thinks that this is because some bikes are being used a lot more than other bikes. Can you check on this assumption?

Client is planning a promotion to increase bike rental by younger folks (age < 25) and tourists (i.e not an annual member). They want to focus on stations that are less used by these target groups. Can you help them identify these stations?

####Client is planning a promotion to increase bike rental by younger folks (age < 25) and tourists (i.e not an annual member). They want to focus on stations that are less used by these target groups. Can you help them identify these stations?
#Qu estion for group: should I combine these two groups into one demographic, or leave them separate?
#Some birth dates are kept empty, can't do anything in those situations. 
head(citiBike)
##   tripduration          starttime           stoptime start.station.id
## 1         1110 11/1/2015 00:00:00 11/1/2015 00:18:31              537
## 2         1094 11/1/2015 00:00:01 11/1/2015 00:18:15              537
## 3          520 11/1/2015 00:00:05 11/1/2015 00:08:45              536
## 4          753 11/1/2015 00:00:15 11/1/2015 00:12:48              229
## 5          353 11/1/2015 00:00:22 11/1/2015 00:06:15              285
## 6         1285 11/1/2015 00:00:22 11/1/2015 00:21:48              268
##        start.station.name start.station.latitude start.station.longitude
## 1 Lexington Ave & E 24 St               40.74026               -73.98409
## 2 Lexington Ave & E 24 St               40.74026               -73.98409
## 3         1 Ave & E 30 St               40.74144               -73.97536
## 4          Great Jones St               40.72743               -73.99379
## 5      Broadway & E 14 St               40.73455               -73.99074
## 6   Howard St & Centre St               40.71911               -73.99973
##   end.station.id         end.station.name end.station.latitude
## 1            531   Forsyth St & Broome St             40.71894
## 2            531   Forsyth St & Broome St             40.71894
## 3            498       Broadway & W 32 St             40.74855
## 4            328  Watts St & Greenwich St             40.72406
## 5            151 Cleveland Pl & Spring St             40.72210
## 6            476          E 31 St & 3 Ave             40.74394
##   end.station.longitude bikeid   usertype birth.year gender time.slot
## 1             -73.99266  22545 Subscriber       1981      2         0
## 2             -73.99266  23959 Subscriber       1980      1         0
## 3             -73.98808  22251 Subscriber       1988      1         0
## 4             -74.00966  15869 Subscriber       1981      1         0
## 5             -73.99725  21645 Subscriber       1987      1         0
## 6             -73.97966  14788   Customer         NA      0         0
##   time.slot.category
## 1       earlyMorning
## 2       earlyMorning
## 3       earlyMorning
## 4       earlyMorning
## 5       earlyMorning
## 6       earlyMorning
young.customers <- subset(citiBike, citiBike$birth.year > 1991)
young.startStation <- as.data.frame(table(young.customers$start.station.id))
head(young.startStation)
##   Var1 Freq
## 1   72   65
## 2   79  106
## 3   82   21
## 4   83   35
## 5  116  172
## 6  119    8
sorted.young <- young.startStation[order(young.startStation$Freq),]
head(sorted.young)
##     Var1 Freq
## 330 3052    1
## 401 3127    1
## 448 3221    1
## 192  418    2
## 325 3044    2
## 366 3089    2
#It may be useful to classify stations "less used" as those with a certain frequency. This would mean taking a subset of sorted.young. 
#User type is either subscriber or customer, so I'm assuming tourists are customers
tourist.customers <- subset(citiBike,citiBike$usertype == "Customer")
tourist.startStation <- as.data.frame(table(tourist.customers$start.station.id))
head(tourist.startStation)
##   Var1 Freq
## 1   72  261
## 2   79  264
## 3   82  155
## 4   83  139
## 5  116  187
## 6  119   23
sorted.tourist <- tourist.startStation[order(tourist.startStation$Freq),]
head(sorted.tourist)
##     Var1 Freq
## 312 2005    1
## 334 3054    2
## 30   218    5
## 339 3059    5
## 330 3049    8
## 453 3221    8